package org.swu.swuse.Processor;

import java.util.Date;

import org.swu.swuse.model.WebPage;
import org.swu.swuse.model.WebSpiderConfig;
import org.swu.swuse.utils.DateUtils;
import org.swu.swuse.utils.HTMLSpirit;

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;

/**
 * 按照WebSpiderConfig模型定义的规则来进行数据爬取
 * 
 * @author zhanjingbo
 *
 */
public class SpiderProcessor implements PageProcessor {

	private Site site = Site.me().setRetryTimes(3).setSleepTime(100);
	private WebSpiderConfig webSpiderConfig;

	public SpiderProcessor(WebSpiderConfig webSpiderConfig) {
		this.webSpiderConfig = webSpiderConfig;
	}

	public void process(Page page) {

		// 获取符合要求的URL 添加至爬取队列
		page.addTargetRequests(page.getHtml().links().regex(webSpiderConfig.getTargetRegular()).all());
		// 实例化一个WebPage模型 指定对应的信息
		WebPage webpage = new WebPage();

		webpage.setSource(webSpiderConfig.getName());
		webpage.setUrl(page.getUrl().toString());
		webpage.setHtml(page.getHtml().toString());
		webpage.setText(HTMLSpirit.delHTMLTag(webpage.getHtml()));
		webpage.setTitle(page.getHtml().$("title").toString());
		webpage.setDate(new Date());

		// 如果当前页面为目标内容页面 进行特殊数据爬取
		if (webpage.getUrl().matches(webSpiderConfig.getContentRegular())) {
			String text = HTMLSpirit.delHTMLTag(page.getHtml().xpath(webSpiderConfig.getTextRegular()).toString());
			if (text != null) {
				webpage.setText(text);
			}
			String title = page.getHtml().xpath(webSpiderConfig.getTitleRegular()).toString();
			if (title != null) {
				webpage.setTitle(title);
			}

			Date date = DateUtils.getDate(page.getHtml().xpath(webSpiderConfig.getTitleRegular()).toString(),
					webSpiderConfig.getSdfRegular());
			webpage.setDate(date);

		}

		if (webpage.getHtml() == null) {
			page.setSkip(true);
		} else {
			page.putField("webpage", webpage);
		}
	}

	public Site getSite() {
		return site;
	}

}
